Commit bb917bd

Add vision model support to new server
The llamafiler v1 chat completions endpoint now lets you embed data URIs containing base64-encoded images inside messages, so --mmproj vision models (e.g. LLaVA) will be able to analyze them. My new web GUI supports image uploads too: images can be dragged and dropped, just like GitHub's online markdown editor lets you do, and they can also be pasted from the clipboard.
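
A request might look like the sketch below. The field names assume the OpenAI-compatible chat completions schema, the model name is a placeholder, and the base64 payload is truncated for illustration; per the eval_string change in this commit, the data URI is embedded directly in the message text:

    POST /v1/chat/completions
    Content-Type: application/json

    {
      "model": "llava-v1.5-7b",
      "messages": [
        {
          "role": "user",
          "content": "What is in this image? data:image/png;base64,iVBORw0KGgo..."
        }
      ]
    }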
1 parent a03b47e commit bb917bd

38 files changed: +1212 -251 lines

llamafile/chatbot_eval.cpp

Lines changed: 18 additions & 14 deletions
@@ -16,17 +16,17 @@
 // limitations under the License.
 
 #include "chatbot.h"
-
-#include <cassert>
-#include <string>
-#include <vector>
-
+#include "llama.cpp/base64.h"
 #include "llama.cpp/common.h"
 #include "llama.cpp/llama.h"
 #include "llama.cpp/llava/llava.h"
 #include "llamafile/datauri.h"
 #include "llamafile/image.h"
+#include "llamafile/llama.h"
 #include "llamafile/string.h"
+#include <cassert>
+#include <string>
+#include <vector>
 
 namespace lf {
 namespace chatbot {
@@ -60,7 +60,7 @@ bool eval_tokens(std::vector<llama_token> tokens) {
     return true;
 }
 
-bool eval_image_embed(const struct llava_image_embed *image_embed) {
+bool eval_image_embed(const llava_image_embed *image_embed) {
     int N = image_embed->n_image_pos;
     if (tokens_used() + N > llama_n_ctx(g_ctx))
         return out_of_context(N);
@@ -113,7 +113,7 @@ bool eval_token(int id) {
 }
 
 bool eval_plain_text(const std::string &str, bool add_special, bool parse_special) {
-    return eval_tokens(llama_tokenize(g_model, str, add_special, parse_special));
+    return eval_tokens(llamafile_tokenize(g_model, str, add_special, parse_special));
 }
 
 bool eval_string(std::string_view s, bool add_special, bool parse_special) {
@@ -122,22 +122,26 @@ bool eval_string(std::string_view s, bool add_special, bool parse_special) {
         size_t pos = s.find("data:", i);
         if (pos == std::string_view::npos)
             return eval_plain_text(std::string(s), add_special, parse_special);
+        i = pos + 5;
         DataUri uri;
         size_t end = uri.parse(s.substr(pos + 5));
-        if (end == std::string_view::npos) {
-            i = pos + 5;
+        if (end == std::string_view::npos)
             continue;
-        }
-        std::string image = uri.decode();
-        if (!is_image(image)) {
-            i = pos + 5;
+        if (!uri.mime.starts_with("image/"))
+            continue;
+        std::string image;
+        try {
+            image = uri.decode();
+        } catch (const base64_error &e) {
             continue;
         }
+        if (!is_image(image))
+            continue;
         if (!eval_plain_text(std::string(s.substr(0, pos)), add_special, parse_special))
             return false;
         if (!eval_image(image))
             return false;
-        s = s.substr(pos + 5 + end);
+        s = s.substr(i + end);
         i = 0;
     }
 }
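
Two details worth noting in the hunk above: the resume offset i = pos + 5 is now set once at the top, so every continue path (unparseable URI, non-image MIME type, bad base64) resumes scanning just past the "data:" prefix, and the final s = s.substr(i + end) is equivalent to the old pos + 5 + end. A minimal sketch of how one match is handled, assuming only the DataUri and base64_error APIs visible in the diff:

    // Sketch (not part of the commit): handling a single "data:" match.
    std::string_view s = "look at data:image/png;base64,iVBORw0KGgo= please";
    size_t pos = s.find("data:");              // start of the candidate URI
    DataUri uri;
    size_t end = uri.parse(s.substr(pos + 5)); // bytes consumed after "data:"
    if (end != std::string_view::npos && uri.mime.starts_with("image/")) {
        try {
            std::string image = uri.decode();  // raw image bytes
            // is_image(image) / eval_image(image) would run here
        } catch (const base64_error &) {
            // malformed base64 payload: skip it and keep scanning
        }
    }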

llamafile/chatbot_main.cpp

Lines changed: 2 additions & 1 deletion
@@ -29,6 +29,7 @@
 #include "llama.cpp/server/server.h"
 #include "llamafile/color.h"
 #include "llamafile/compute.h"
+#include "llamafile/llama.h"
 #include "llamafile/string.h"
 
 namespace lf {
@@ -69,7 +70,7 @@ std::string describe_compute(void) {
 std::string token_to_piece(const struct llama_context *ctx, llama_token token, bool special) {
     if (token == IMAGE_PLACEHOLDER_TOKEN)
         return "";
-    return llama_token_to_piece(ctx, token, special);
+    return llamafile_token_to_piece(ctx, token, special);
 }
 
 void on_server_listening(const char *host, int port) {

llamafile/flags.cpp

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ int FLAG_batch = 2048;
 int FLAG_ctx_size = 8192;
 int FLAG_flash_attn = false;
 int FLAG_gpu = 0;
-int FLAG_http_ibuf_size = 1024 * 1024;
+int FLAG_http_ibuf_size = 5 * 1024 * 1024;
 int FLAG_http_obuf_size = 1024 * 1024;
 int FLAG_keepalive = 5;
 int FLAG_main_gpu = 0;
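
The fivefold bump to the HTTP input buffer is presumably what makes inline images practical: base64 inflates payloads by a factor of 4/3, so a 5 MiB buffer accommodates roughly a 3.7 MiB image (5 / (4/3) ≈ 3.75 MiB) plus the surrounding JSON, whereas the old 1 MiB limit would have rejected all but small thumbnails.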

llamafile/llama.cpp

Lines changed: 38 additions & 0 deletions
@@ -16,10 +16,48 @@
 // limitations under the License.
 
 #include "llama.h"
+#include "llama.cpp/llama.h"
+#include <cassert>
+#include <string>
+#include <vector>
 
 int llamafile_token_eot(llama_model *model) {
     llama_token eot = llama_token_eot(model);
     if (eot != -1)
         return eot;
     return llama_token_eos(model);
 }
+
+std::string llamafile_token_to_piece(const llama_context *ctx, llama_token token, bool special) {
+    std::string piece;
+    piece.resize(piece.capacity());
+    const int n_chars =
+        llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check =
+            llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        unassert(check == -n_chars);
+    } else {
+        piece.resize(n_chars);
+    }
+    return piece;
+}
+
+std::vector<llama_token> llamafile_tokenize(const struct llama_model *model,
+                                            const std::string_view &text, bool add_special,
+                                            bool parse_special) {
+    int n_tokens = text.size() + 2 * add_special;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(model, text.data(), text.size(), result.data(), result.size(),
+                              add_special, parse_special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(model, text.data(), text.size(), result.data(), result.size(),
+                                   add_special, parse_special);
+        unassert(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
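
Both helpers follow llama.cpp's size-negotiation convention: call once with a guessed buffer, and if the return value is negative, its magnitude is the required size, so resize and call again. A hedged usage sketch, assuming a model and context already loaded elsewhere and with error handling elided:

    // Hypothetical round-trip through the two helpers above.
    std::vector<llama_token> toks =
        llamafile_tokenize(model, "an example prompt",
                           /*add_special=*/true, /*parse_special=*/false);
    std::string text;
    for (llama_token tok : toks)
        text += llamafile_token_to_piece(ctx, tok, /*special=*/false);
    // `text` now approximates the prompt; special tokens render as "".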

llamafile/llama.h

Lines changed: 8 additions & 1 deletion
@@ -16,7 +16,8 @@
 // limitations under the License.
 
 #pragma once
-#include "llama.cpp/llama.h"
+#include <__fwd/string_view.h>
+#include <__fwd/vector.h>
 
 // Many llama.cpp APIs take boolean parameters at the end. Please favor
 // passing these constants as arguments instead, for better readability
@@ -36,4 +37,10 @@
 #define RENDER_SPECIAL_TOKENS true
 #define DONT_RENDER_SPECIAL_TOKENS false
 
+struct llama_model;
+struct llama_context;
+
 int llamafile_token_eot(llama_model *);
+
+std::string llamafile_token_to_piece(const llama_context *, int, bool);
+std::vector<int> llamafile_tokenize(const llama_model *, const std::string_view &, bool, bool);
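
The header no longer drags in all of llama.cpp: <__fwd/string_view.h> and <__fwd/vector.h> are libc++'s forward-declaration headers for std::string_view and std::vector, and llama_model / llama_context become opaque forward declarations, so translation units that only want the constants avoid the full llama.h parse. The prototypes can spell the token type as plain int because llama_token is a typedef for int32_t, so they still match the definitions in llamafile/llama.cpp.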

llamafile/server/BUILD.mk

Lines changed: 39 additions & 26 deletions
@@ -12,17 +12,17 @@ LLAMAFILE_SERVER_ASSETS = $(wildcard llamafile/server/www/*)
 
 $(LLAMAFILE_SERVER_OBJS): private CCFLAGS += -g
 
-o/$(MODE)/llamafile/server/server.a: \
+o/$(MODE)/llamafile/server/server.a:                            \
         $(filter-out %_test.o,$(LLAMAFILE_SERVER_OBJS))
 
-o/$(MODE)/llamafile/server/main: \
-        o/$(MODE)/llamafile/server/main.o \
-        o/$(MODE)/llamafile/server/main.1.asc.zip.o \
-        o/$(MODE)/llamafile/server/server.a \
-        o/$(MODE)/llama.cpp/llama.cpp.a \
-        o/$(MODE)/llama.cpp/llava/llava.a \
-        o/$(MODE)/double-conversion/double-conversion.a \
-        o/$(MODE)/stb/stb.a \
+o/$(MODE)/llamafile/server/main:                                \
+        o/$(MODE)/llamafile/server/main.o                       \
+        o/$(MODE)/llamafile/server/main.1.asc.zip.o             \
+        o/$(MODE)/llamafile/server/server.a                     \
+        o/$(MODE)/llama.cpp/llama.cpp.a                         \
+        o/$(MODE)/llama.cpp/llava/llava.a                       \
+        o/$(MODE)/double-conversion/double-conversion.a         \
+        o/$(MODE)/stb/stb.a                                     \
         $(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o)
 
 # turn /zip/llamafile/server/www/...
@@ -31,24 +31,37 @@ $(LLAMAFILE_SERVER_ASSETS:%=o/$(MODE)/%.zip.o): private ZIPOBJ_FLAGS += -C2
 
 $(LLAMAFILE_SERVER_OBJS): llamafile/server/BUILD.mk
 
-o/$(MODE)/llamafile/server/fastjson_test: \
-        o/$(MODE)/llamafile/server/fastjson_test.o \
-        o/$(MODE)/llamafile/server/fastjson.o \
-        o/$(MODE)/double-conversion/double-conversion.a \
+o/$(MODE)/llamafile/server/atom_test:                           \
+        o/$(MODE)/llamafile/server/atom_test.o                  \
+        o/$(MODE)/llamafile/server/atom.o                       \
+        o/$(MODE)/llamafile/server/image.o                      \
 
-o/$(MODE)/llamafile/server/json_test: \
-        o/$(MODE)/llamafile/server/json_test.o \
-        o/$(MODE)/llamafile/server/json.o \
-        o/$(MODE)/llamafile/server/hextoint.o \
-        o/$(MODE)/double-conversion/double-conversion.a \
+o/$(MODE)/llamafile/server/image_test:                          \
+        o/$(MODE)/llamafile/server/image_test.o                 \
+        o/$(MODE)/llamafile/server/image.o                      \
 
-o/$(MODE)/llamafile/server/tokenbucket_test: \
-        o/$(MODE)/llamafile/server/tokenbucket_test.o \
-        o/$(MODE)/llamafile/server/tokenbucket.o \
-        o/$(MODE)/llamafile/server/log.o \
-        o/$(MODE)/llama.cpp/llama.cpp.a \
+o/$(MODE)/llamafile/server/fastjson_test:                       \
+        o/$(MODE)/llamafile/server/fastjson_test.o              \
+        o/$(MODE)/llamafile/server/fastjson.o                   \
+        o/$(MODE)/double-conversion/double-conversion.a         \
+
+o/$(MODE)/llamafile/server/json_test:                           \
+        o/$(MODE)/llamafile/server/json_test.o                  \
+        o/$(MODE)/llamafile/server/json.o                       \
+        o/$(MODE)/llamafile/server/hextoint.o                   \
+        o/$(MODE)/double-conversion/double-conversion.a         \
+
+o/$(MODE)/llamafile/server/tokenbucket_test:                    \
+        o/$(MODE)/llamafile/server/tokenbucket_test.o           \
+        o/$(MODE)/llamafile/server/tokenbucket.o                \
+        o/$(MODE)/llamafile/server/log.o                        \
+        o/$(MODE)/llama.cpp/llama.cpp.a                         \
 
 .PHONY: o/$(MODE)/llamafile/server
-o/$(MODE)/llamafile/server: \
-        o/$(MODE)/llamafile/server/main \
-        o/$(MODE)/llamafile/server/json_test.runs \
+o/$(MODE)/llamafile/server:                                     \
+        o/$(MODE)/llamafile/server/main                         \
+        o/$(MODE)/llamafile/server/atom_test.runs               \
+        o/$(MODE)/llamafile/server/fastjson_test.runs           \
+        o/$(MODE)/llamafile/server/image_test.runs              \
+        o/$(MODE)/llamafile/server/json_test.runs               \
+        o/$(MODE)/llamafile/server/tokenbucket_test.runs        \
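
Assuming the repo's usual make conventions, building the phony aggregate (e.g. make -j8 o//llamafile/server) should now run the new atom_test and image_test alongside the existing fastjson, json, and tokenbucket suites.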

llamafile/server/atob.cpp

Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@
 // limitations under the License.
 
 #include "utils.h"
-
 #include <string_view>
 
 bool
